#!/usr/bin/env python
#-----------------------------------------------------------------------------
# Name:        sphinx-wxoptimize
# Purpose:     Convert sphinx htmlhelp to format optimized for wxHtmlHelp
#
# Author:      Rob McMullen
#
# Created:     2009
# Copyright:   (c) 2009 Rob McMullen
# License:     GPL
#-----------------------------------------------------------------------------
"""Convert sphinx-generated htmlhelp files into versions optimized for reading
using the wxHtmlHelp browser

The sphinx document processor generates HTML that uses more advanced syntax
than the wxHtmlHelp system can render.  For an example of one of the problems
corrected with this script: sphinx uses CSS for the header and footer
navigation links (previous, next, etc.), which isn't rendered properly in
wxHtmlHelp.  This script turns those links into a table which is rendered
correctly with wxHtmlHelp.

This tool is designed to be called as a post-processing step to the 'make
htmlhelp' command used in sphinx processing.  A convenient way to use this
script is to integrate it into the makefile by adding this command to the
htmlhelp target:

    sphinx-wxoptimize _build/htmlhelp

or because _build/htmlhelp is the default source directory, simply:

    sphinx-wxoptimize

The script is also capable of operating on individual files for testing
purposes.  Without any arguments, the script will process any files in the
_build/htmlhelp directory.
"""

import os, sys, glob, shutil, re
from BeautifulSoup import BeautifulSoup, Tag, NavigableString

def convert(filename, options):
    """Convert a single HTML file from the htmlhelp format of sphinx to a
    version optimized for reading using the wxHtmlHelp browser

    The file is parsed with BeautifulSoup, passed through every fix-up
    routine of this module in a fixed order, and serialized back to disk.

    @param filename: path of the HTML file to convert
    @param options: parsed command line options.  C{destdir} and
    C{testprefix} are read here; the object is also handed to L{getConf}.

    Where the result is written:

     - by default the source file is overwritten;
     - if C{options.destdir} is set, the file is written under that directory
       using the base name of the source file (the directory is created when
       missing);
     - if C{options.testprefix} is set, it takes precedence over both of the
       above: the prefix is prepended to C{filename} exactly as given,
       including any directory part.
    """
    # Context managers guarantee the handles are closed even if reading,
    # parsing or writing raises.
    with open(filename) as fh:
        text = fh.read()
    conf = getConf(options)

    soup = BeautifulSoup(text)
    removeCSS(soup)
    removeSearch(soup, filename, conf)
    addAnchors(soup)
    convertNavigation(soup)
    removeUnnecessaryDocumentDivs(soup)
    removeUnnecessaryHighlightDivs(soup)
    fixAlignCenter(soup)
    fixDefinitionLists(soup)
    fixIndexTargets(soup)

    if options.destdir:
        if not os.path.exists(options.destdir):
            # makedirs rather than mkdir so a nested output directory such
            # as "out/help" can be created in one go.
            os.makedirs(options.destdir)
        outfile = os.path.join(options.destdir, os.path.basename(filename))
    else:
        outfile = filename
    if options.testprefix:
        outfile = options.testprefix + filename
    with open(outfile, "w") as fh:
        fh.write(str(soup))

def removeCSS(soup):
    """Strip every <link> element from the document

    wxHtmlHelp ignores CSS as well as any other <link> tag found in the
    header, so all of them are detached from the parse tree.
    """
    # Iterate over a snapshot because extract() modifies the tree.
    for link in list(soup.findAll("link")):
        link.extract()

def removeSearch(soup, filename, conf):
    """Drop the link to the search page from the master index

    htmlhelp output has no search page.  When the file being processed is the
    master document (its base name starts with conf['master_doc'] followed by
    a dot), the parent element of every <a href="search.html"> is removed.
    Nothing happens when 'master_doc' is absent from conf.
    """
    basename = os.path.basename(filename)
    if 'master_doc' not in conf:
        return

    # The master index name comes from the sphinx config file.
    if not basename.startswith(conf['master_doc'] + "."):
        return

    # The search page name appears to be hardcoded as "search.html".
    search_links = soup.findAll("a", attrs={"href": "search.html"})
    for link in list(search_links):
        link.parent.extract()

def addAnchors(soup):
    """Add <a name> anchors for sections and id-only spans

    wxHtmlHelp does not resolve link targets expressed as ids on the divs and
    spans that sphinx emits, so following a section link in unmodified
    htmlhelp output pops up an error dialog.  Two conversions are applied:

     - every <div class="section"> gets an <a name="ID"> as its first child
       (ID being the div's id) and the div is then replaced by its contents;
     - every <span> that has an id attribute and no class attribute is
       replaced by an <a name="ID"> tag.
    """
    for section in list(soup.findAll("div", "section")):
        anchor = Tag(soup, "a", attrs=[("name", section["id"])])
        section.insert(0, anchor)
        replaceWithContents(section)

    # Spans carrying only an id are generated by htmlhelp as link targets;
    # spans that also have a class are left untouched.
    for span in list(soup.findAll("span")):
        attr_names = [name for name, value in span.attrs]
        if "id" not in attr_names or "class" in attr_names:
            continue
        anchor = Tag(soup, "a", attrs=[("name", span["id"])])
        span.replaceWith(anchor)

def convertNavigation(soup):
    """Turn each css-styled navigation list into a one-row table

    The stylesheet places some navigation entries on the left and the ones
    marked class="right" on the right.  That layout is imitated with a table
    of two 50% wide cells, the second of which carries align="right".  Each
    <div class="related"> is replaced by such a table.
    """
    for related in soup.findAll("div", "related"):
        right_para = Tag(soup, "p")
        left_para = Tag(soup, "p")

        # Right-hand entries are collected in reverse document order.
        right_entries = related.findAll("li", attrs={"class": "right"})
        for entry in reversed(right_entries):
            # Detach the entry so that the unrestricted <li> search below
            # only sees the entries that have not been handled yet.
            entry.extract()

            # append() pulls the node out of entry.contents, hence the copy.
            for node in list(entry.contents):
                right_para.append(node)

        for entry in related.findAll("li"):
            for node in list(entry.contents):
                left_para.append(node)

        table = Tag(soup, "table")
        row = Tag(soup, "tr")
        table.append(row)

        left_cell = Tag(soup, "td", attrs=[("width", "50%")])
        row.append(left_cell)
        left_cell.append(left_para)

        right_cell = Tag(soup, "td", attrs=[("width", "50%"), ("align", "right")])
        row.append(right_cell)
        right_cell.append(right_para)

        related.replaceWith(table)

def removeUnnecessaryDocumentDivs(soup):
    """The div document wrappers add unnecessary whitespace

    The three <div> elements that make up the body of the document only serve
    to add blank lines before the first section heading when viewed in the
    wxHtmlHelp browser:

    <div class="document">
      <div class="documentwrapper">
          <div class="body">

    For every <div class="document">, the first <div class="body"> found
    inside it is put in place of the document div and is then itself replaced
    by its contents, effectively removing those three levels of divs.

    A document div that contains no <div class="body"> is left untouched.
    """
    for document in soup.findAll("div", "document"):
        body = document.find("div", "body")
        if body is None:
            # find() yields None when there is no body wrapper.  Passing that
            # on to replaceWith()/replaceWithContents() would raise, so skip
            # this div instead.
            continue
        document.replaceWith(body)
        replaceWithContents(body)

def removeUnnecessaryHighlightDivs(soup):
    """Unwrap the divs that sphinx puts around highlighted code

    These wrappers only introduce extra line breaks in wxHtmlHelp.  Every div
    whose class matches the pattern

    highlight-.*

    is replaced by its contents.
    """
    highlight_class = re.compile("highlight-.*")
    for wrapper in soup.findAll("div", attrs={'class': highlight_class}):
        replaceWithContents(wrapper)

def fixAlignCenter(soup):
    """Add vertical space at the top of centered blocks

    The centering div used for images doesn't place a blank line before the
    image in wxHtmlHelp.  To separate the image from the preceding text, an
    empty <p> tag is inserted as the first child of every
    <div class="align-center">.
    """
    for centered in soup.findAll("div", "align-center"):
        spacer = Tag(soup, "p")
        centered.insert(0, spacer)

def fixDefinitionLists(soup):
    """wxHtmlHelp can't handle multiple paragraphs in definition list <dd> tags.
    
    Definition list <dd> tags that don't have paragraph tags in them are
    left alone, because they seem to be rendered properly in the HtmlHelp
    controller.
    
    Every <dd> that contains at least one <p> is rebuilt in two passes:
    
     1. The <dd> is swapped for a placeholder <p id="N"> and its direct
        children are remembered.
     2. The remembered children are walked in order.  Each child that is a
        tag yields a new element: a <dd> holding the child (the contents of a
        <p> child are moved over rather than the <p> itself, and for every
        tag child after the first a fresh <p> is put inside the <dd> to hold
        them), or a <ul> wrapping the child when it is a <pre>.  The first
        element produced replaces the placeholder; all later elements, and
        all later text nodes, are appended to that first element.  Text nodes
        that come before the first tag child are not re-inserted.
    
    Finally every placeholder is extracted.
    """
    dd_lists = soup.findAll("dd")
    # Pairs of (placeholder tag, list of the original <dd>'s direct children)
    replacement_list = []
    # Used only to give each placeholder a distinct id attribute
    count = 0
    for dd in dd_lists:
        paragraphs = dd.findAll("p")
        if len(paragraphs) > 0:
            #print("Found paragraphs: %s" % paragraphs)
            # Add dummy paragraph that will get replaced in processing
            tag = Tag(soup, "p", attrs=[("id", "%d" % count)])
            count += 1
            dd.replaceWith(tag)
            # Snapshot the children; the loop below moves them elsewhere
            replacement_list.append((tag, list(dd.contents)))
            #print("Found dt: %s" % tag.parent)
    
    for dummy, siblings in replacement_list:
        # NOTE(review): index is computed here and incremented below but is
        # not read anywhere else in this function.
        index = dummy.parent.contents.index(dummy)
        #print("Inserting at %d" % index)
        # first stays True until the first tag child has been placed
        first = True
        # replace is non-None until the placeholder has been swapped out
        replace = dummy
        # after is the first element placed; it receives everything that
        # follows and is never reassigned once set
        after = None
        for sibling in siblings:
            #print("Content: %s" % sibling)
            # Only tags have a 'contents' attribute; text nodes take the
            # else branch at the bottom of the loop
            if hasattr(sibling, 'contents'):
                tag = Tag(soup, "dd")
                if not first and sibling.name != 'pre':
                    #print("New Paragraph!")
                    # Later non-<pre> children get their own <p> inside the
                    # new <dd>
                    p = Tag(soup, "p")
                    tag.insert(0, p)
                    insert_into = p
                else:
                    insert_into = tag
                if sibling.name == "p":
                    #print("Paragraph!")
                    i = 0
                    
                    # Move the paragraph's children over in their original
                    # order; the <p> element itself is not reused
                    for content in list(sibling.contents):
                        insert_into.insert(i, content)
                        i += 1
                elif sibling.name == "pre":
                    # Preformatted blocks aren't indented in the HtmlHelp
                    # viewer inside <dd> elements, but they *are* inside <ul>
                    # elements.  So, we fake it out by adding a wrapper <ul>
                    # element around the pre.
                    p = Tag(soup, "ul")
                    p.insert(0, sibling)
                    insert_into.insert(0, p)
                    # From here on the <ul> is the element that gets placed in
                    # the tree.  NOTE(review): the <dd> created above is
                    # presumably left behind unused once the <ul> is
                    # re-parented -- confirm against BeautifulSoup's insert
                    # behavior.
                    tag = p
                else:
                    #print("Something other than a paragraph!: %s" % sibling.name)
                    insert_into.insert(0, sibling)
                index += 1
                if replace is not None:
                    # First element produced: it takes the placeholder's spot
                    dummy.replaceWith(tag)
                    replace = None
                    after = tag
                    first = False
                else:
                    # Later elements become children of the first element
                    after.append(tag)
            else:
                #print("Found -->%s<--" % sibling)
                # Text node: kept only once a first element exists to hold it
                if after is not None:
                    after.append(sibling)
            
    # Detach every placeholder tag
    for dummy, dd in replacement_list:
        dummy.extract()

def fixIndexTargets(soup):
    """Convert index link targets into <a name> anchors

    Much like L{addAnchors}: each <span class="target"> that has an id
    attribute is replaced by an <a name="ID"> tag, which is the form of link
    target that the wxHtmlHelp system understands.  Target spans without an
    id are left as they are.
    """
    for span in list(soup.findAll("span", attrs={"class": "target"})):
        has_id = any(name == "id" for name, value in span.attrs)
        if has_id:
            anchor = Tag(soup, "a", attrs=[("name", span["id"])])
            span.replaceWith(anchor)



def replaceWithContents(tag):
    """Replace a tag by its children

    The children of C{tag} are inserted into the tag's parent, in order,
    starting at the position the tag occupies; afterwards the tag itself is
    extracted from the tree.
    """
    parent = tag.parent
    position = parent.contents.index(tag)
    # Walk a copy: inserting a child elsewhere removes it from tag.contents.
    for child in list(tag.contents):
        parent.insert(position, child)
        position += 1
    tag.extract()

# Cache of executed configuration namespaces, keyed by the path of conf.py
conf_mapping = {}
def getConf(options):
    """Return the settings defined by the sphinx conf.py

    The file conf.py inside C{options.confdir} is executed and the resulting
    namespace is returned as a dictionary.  Results are cached per path in
    L{conf_mapping}, so the file is executed (or reported as missing) at most
    once.

    @param options: parsed command line options; only C{confdir} is read
    @return: dictionary of the names defined by conf.py, or an empty
    dictionary when the file does not exist
    """
    conf_path = os.path.join(options.confdir, "conf.py")
    if conf_path in conf_mapping:
        return conf_mapping[conf_path]

    conf = {}
    if os.path.exists(conf_path):
        # execfile() only exists in Python 2; compiling and exec'ing the
        # source behaves the same way and also works on Python 3.  The file
        # is read as bytes so that compile() can honor a source encoding
        # declaration.
        # NOTE: this runs arbitrary code from conf.py, which is assumed to
        # be the project's own trusted configuration.
        with open(conf_path, "rb") as fh:
            source = fh.read()
        # Expose __file__ so a conf.py that locates things relative to
        # itself can be executed.
        conf['__file__'] = conf_path
        exec(compile(source, conf_path, 'exec'), conf, conf)
    else:
        print("Error: can't find conf.py.  Some features will be unavailable")
    conf_mapping[conf_path] = conf
    return conf

def convertAll(dirname, func, options):
    """Run a conversion function over every HTML file of a directory

    C{func} is called as func(path, options) for each *.html file directly
    inside C{dirname}, and the number of processed files is reported.
    Afterwards the static files are copied to the destination directory when
    one was requested; otherwise the unused javascript and css files are
    removed in place.
    """
    processed = 0
    for path in glob.glob("%s/*.html" % dirname):
        if options.verbose:
            print("converting %s" % path)
        func(path, options)
        processed += 1
    print("Processed %d files" % processed)

    # Both follow-up steps take the same arguments.
    finish = copyStatic if options.destdir else removeJavascript
    finish(dirname, options)

def copyStatic(dirname, options):
    """Copy the static files to the destination directory

    Two groups of files are copied from C{dirname} to C{options.destdir}:

     - the files directly inside every subdirectory whose name starts with an
       underscore, into a subdirectory of the same name;
     - the project files named after conf.py's C{htmlhelp_basename} (any
       extension).

    Finally the javascript and css files are removed from the copy.

    @param dirname: directory holding the generated htmlhelp files
    @param options: parsed command line options; C{destdir} and C{verbose}
    are read here
    """
    # Find the configuration variables used in the sphinx project
    # FIXME: assumes that we are working in the same directory.
    conf = getConf(options)
    if not conf:
        print("Error: can't find conf.py to determine htmlhelp_basename")
        return

    # The destination may not exist yet, e.g. when no HTML file was converted.
    if not os.path.exists(options.destdir):
        os.makedirs(options.destdir)

    for subdir in glob.glob("%s/_*" % dirname):
        if not os.path.isdir(subdir):
            continue
        destsubdir = os.path.join(options.destdir, os.path.basename(subdir))
        if not os.path.exists(destsubdir):
            os.makedirs(destsubdir)
        # NOTE(review): entries of subdir that are themselves directories are
        # handed to shutil.copy unchanged -- confirm whether nested
        # directories can occur in the htmlhelp output.
        for src in glob.glob("%s/*" % subdir):
            dest = os.path.join(destsubdir, os.path.basename(src))
            if options.verbose:
                # print() with a single argument, as everywhere else in this
                # file; the statement form is a syntax error on Python 3.
                print("cp %s %s" % (src, dest))
            shutil.copy(src, dest)

    # conf.py is not required to define htmlhelp_basename, so don't index the
    # dictionary directly.
    basename = conf.get('htmlhelp_basename')
    if basename is None:
        print("Error: htmlhelp_basename is not set in conf.py; project files not copied")
    else:
        for src in glob.glob("%s/%s.*" % (dirname, basename)):
            dest = os.path.join(options.destdir, os.path.basename(src))
            if options.verbose:
                print("cp %s %s" % (src, dest))
            shutil.copy(src, dest)

    removeJavascript(options.destdir, options)

def removeJavascript(dirname, options):
    """Remove unnecessary javascript and css files from the _static directory
    since they are unused after the conversion and just take up space.

    Each entry of conf.py's C{html_static_path} is taken as a subdirectory of
    C{dirname}; the *.js and *.css files directly inside it are deleted.
    Nothing is deleted when C{html_static_path} is not defined.

    @param dirname: directory holding the converted htmlhelp files
    @param options: parsed command line options; C{verbose} is read here
    """
    conf = getConf(options)
    if not conf:
        print("Error: can't find conf.py to determine html_static_path")
        return

    # Fall back to an empty list instead of raising KeyError when conf.py
    # doesn't define the setting.
    static_paths = conf.get('html_static_path', [])
    if options.verbose:
        print("Removing Javascript and CSS from %s" % static_paths)
    for path in static_paths:
        subdir = os.path.join(dirname, path)
        for pattern in ("*.js", "*.css"):
            for filename in glob.glob(os.path.join(subdir, pattern)):
                if options.verbose:
                    print("removing %s" % filename)
                os.remove(filename)

if __name__ == "__main__":
    from optparse import OptionParser

    # The module docstring doubles as the long help text.
    usage = "usage: %prog [options] [<directory> <dir>... | <file> <file>...]\n\n" + __doc__
    parser = OptionParser(usage=usage)
    parser.add_option(
        "-v", action="store_true", dest="verbose", default=False,
        help="Be verbose when operating")
    parser.add_option(
        "-c", action="store", dest="confdir", default="",
        help="Specify the directory containing the .rst files if not running in that directory.")
    parser.add_option(
        "-o", action="store", dest="destdir", default="",
        help="Specify an alternate directory for output files rather than overwriting the source htmlhelp files")
    parser.add_option(
        "-t", action="store", dest="testprefix", default="",
        help="For testing purposes, use the prefix for all generated HTML files instead of overwriting the files")
    options, args = parser.parse_args()

    # conf.py is looked up in the current directory unless -c says otherwise.
    options.confdir = options.confdir or os.getcwd()

    if not args:
        # No explicit targets: try the default build directory relative to
        # the current directory first, then relative to the config directory.
        local_build = "_build/htmlhelp"
        conf_build = os.path.join(options.confdir, "_build/htmlhelp")
        if os.path.isdir(local_build):
            args = [local_build]
        elif os.path.isdir(conf_build):
            args = [conf_build]
        else:
            parser.print_help()

    # Directories are converted wholesale, anything else as a single file.
    for target in args:
        if os.path.isdir(target):
            convertAll(target, convert, options)
        else:
            convert(target, options)
